#include "mpi.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*	Measures bandwidth and/or latency between pairs of MPI processes */

#define DEF_BW_SIZE (1024*1024) // default message size, and the largest message size accepted: 1 MiB
#define MYBUFSIZE ((4*DEF_BW_SIZE)+4096) // bytes allocated for each of the send and receive buffers: 4 MiB + 4 KiB
#define MAX_REQ_NUM 100000 // capacity of the request/status arrays below; also the largest accepted iteration count
#define SIZE_MULT 4 // factor by which the message size grows at each step of a size sweep
#define DEF_LOOP 1000 // default number of iterations per test
#define DEF_LT_SIZE 1 // default latency-test message size: 1 byte
#define OUT_PER_RANK 1 // value of the output argument that enables per-sender result lines

// Request handles and statuses for the non-blocking transfers of the bandwidth test
// (one slot per iteration).
MPI_Request request[MAX_REQ_NUM];
MPI_Status rcv_stat[MAX_REQ_NUM];

/* Prints the command-line help text to stderr. */
static void print_usage(void)
{
	fprintf(stderr, "Usage:  mpi_p loop msg_size test_type order [output]\n\n"
		"\tloop - the number of times to transfer the data\n"
		"\tmsg_size - the number of bytes to transfer (-1 will run over 1 to 4M)\n"
		"\ttest_type=bw|lt|bo - run bandwidth, latency or both\n"
		"\torder=0|1 - senders are: even ranks (i -> i+1), or - "
		"first half of ranks (i -> i+np/2)\n"
		"\toutput=1 - outputs results per sender rank in addition to min, max, avg\n"
		"\twarmup=no|yes - with or without warmup before test, default=yes\nExamples:\n"
		"\t/usr/voltaire/mpi/bin/mpirun_ssh -np 2 -hostfile host_file /usr/voltaire/mpi/bin/mpi_p\n"
		"\t\targs: - none\n"
		"\t\t      - no\n"
		"\t\t      - 1000 5 lt 0 0 no\n"
		"\t\t      - 1000 131072 bw 1 1 no\n"
		"\t\t      - 1000 -1 bo 0 0 no\n");
}

/*
 * Runs one bandwidth round with the partner rank `remote`.
 * A sender posts `loop` non-blocking sends of `msg_size` bytes, waits for all
 * of them and then blocks on a 4-byte acknowledgement; a receiver posts the
 * matching non-blocking receives, waits, and sends the acknowledgement.
 * Uses the file-level `request` / `rcv_stat` arrays, so `loop` must not
 * exceed MAX_REQ_NUM.
 */
static void bw_round(int sender, int remote, int msg_size, int loop,
		     char *s_buf, char *r_buf)
{
	int i;

	if (sender)
	{
		for (i = 0; i < loop; i++)
		{
			MPI_Isend(s_buf, msg_size, MPI_CHAR, remote, 100, MPI_COMM_WORLD, request + i);
		}
		MPI_Waitall(loop, request, rcv_stat);
		MPI_Recv(r_buf, 4, MPI_CHAR, remote, 101, MPI_COMM_WORLD, &rcv_stat[0]);
	}
	else
	{
		for (i = 0; i < loop; i++)
		{
			MPI_Irecv(r_buf, msg_size, MPI_CHAR, remote, 100, MPI_COMM_WORLD, request + i);
		}
		MPI_Waitall(loop, request, rcv_stat);
		MPI_Send(s_buf, 4, MPI_CHAR, remote, 101, MPI_COMM_WORLD);
	}
}

/*
 * Runs `loop + skip` ping-pong exchanges of `msg_size` bytes with `remote`.
 * The first `skip` exchanges are untimed warmup. Returns the wall-clock time
 * (seconds) of the remaining `loop` exchanges on a sender, 0.0 on a receiver.
 * Requires loop >= 1 so that the timer is started.
 *
 * NOTE(review): tags run up to loop + skip + 999; MPI only guarantees
 * MPI_TAG_UB >= 32767, so very large loop counts rely on the implementation
 * allowing bigger tags - confirm for the target MPI.
 */
static double lt_round(int sender, int remote, int msg_size, int loop, int skip,
		       char *s_buf, char *r_buf)
{
	MPI_Status stat;
	double t_start = 0.0;
	int i;

	if (!sender)
	{
		for (i = 0; i < loop + skip; i++)
		{
			MPI_Recv(r_buf, msg_size, MPI_CHAR, remote, i, MPI_COMM_WORLD, &stat);
			MPI_Send(s_buf, msg_size, MPI_CHAR, remote, i + 1000, MPI_COMM_WORLD);
		}
		return 0.0;
	}

	for (i = 0; i < loop + skip; i++)
	{
		if (i == skip)
		{
			t_start = MPI_Wtime();
		}
		MPI_Send(s_buf, msg_size, MPI_CHAR, remote, i, MPI_COMM_WORLD);
		MPI_Recv(r_buf, msg_size, MPI_CHAR, remote, i + 1000, MPI_COMM_WORLD, &stat);
	}
	return MPI_Wtime() - t_start;
}

/*
 * Prints the gathered results (called on rank 0 only).
 * `results` holds one value per rank; only the sender ranks' entries are
 * read: every second rank for order 0, the first half of the ranks otherwise.
 * When `per_rank` is non-zero one line per sender is printed, followed in all
 * cases by the min / max / average summary line.
 */
static void report_results(const double *results, int numprocs, int order,
			   int per_rank, int msg_size, int is_latency)
{
	const int pairs = numprocs / 2;
	const int limit = (order == 0) ? numprocs : pairs;
	const int step = (order == 0) ? 2 : 1;
	const int peer_offset = (order == 0) ? 1 : pairs;
	double sum = 0.0;
	double max = 0.0;
	double min = results[0];	/* rank 0 is a sender in both pairings */
	int i;

	for (i = 0; i < limit; i += step)
	{
		if (per_rank)
		{
			if (is_latency)
			{
				fprintf(stdout, "LT: [%d]->[%d]: %d\t%f\t\n", i, i + peer_offset, msg_size, results[i]);
			}
			else
			{
				fprintf(stdout, "bw: [%d]->[%d]: %d\t%f\n", i, i + peer_offset, msg_size, results[i]);
			}
		}
		sum += results[i];
		if (results[i] < min)
		{
			min = results[i];
		}
		if (results[i] > max)
		{
			max = results[i];
		}
	}

	if (is_latency)
	{
		fprintf(stdout, "LT (%d) (size min max avg)  %d\t%f\t%f\t%f\n", numprocs, msg_size, min, max, sum / pairs);
	}
	else
	{
		fprintf(stdout, "BW (%d) (size min max avg)  %d\t%f\t%f\t%f\n", numprocs, msg_size, min, max, sum / pairs);
	}
}

/*
 * Entry point of the benchmark.
 *
 * Arguments (all positional): loop msg_size test_type order [output] [warmup]
 *   - no arguments, or the single argument "no" (disable warmup): defaults;
 *   - "--help", or 2-3 arguments: print usage and exit;
 *   - more than 7 arguments: repeat the whole measurement forever.
 * Requires an even number of ranks. Returns 0 on every normal exit path;
 * aborts the MPI job if the buffers cannot be allocated.
 */
int main(int argc,char *argv[])
{
	int myid, numprocs, midd, remote;	// process layout
	int loop = DEF_LOOP;			// iterations per test
	int size_bw = DEF_BW_SIZE;		// bandwidth message size
	int size_lt = DEF_LT_SIZE;		// latency message size
	int test_bw = 1, test_lt = 1;		// which tests to run
	int order = 0;				// pairing scheme
	int output = 0;				// OUT_PER_RANK prints results per sender
	int warmup = 1;				// run an untimed bandwidth round first
	int sweep = 0;				// msg_size == -1: step through sizes
	int endless = 0;			// repeat forever
	int sender = 0;				// non-zero if this rank initiates transfers
	const int skip = 10;			// untimed iterations at the start of the latency test
	char *s_buf, *r_buf;			// send / receive buffers
	double *results;			// gather target, one slot per rank

	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
	MPI_Comm_rank(MPI_COMM_WORLD, &myid);

	midd = numprocs/2;

	if (numprocs % 2 != 0)
	{
		if (myid == 0)
		{
			fprintf(stderr, "\nUsage: only even number of processors allowed\n");
		}
		MPI_Finalize();
		return 0;
	}

	if (argc < 3 && !(argc == 2 && strcmp(argv[1], "--help") == 0))
	{
		// Defaults. Only an explicit argument is inspected; the program
		// name is never copied or compared.
		if (argc == 2 && strcmp(argv[1], "no") == 0)
		{
			warmup = 0;
		}
	}
	else if (argc < 5)
	{
		if (myid == 0)
		{
			print_usage();
		}
		MPI_Finalize();
		return 0;
	}
	else
	{
		const char *test_type = argv[3];	// compared in place, no fixed-size copy
		int size = atoi(argv[2]);
		int bad_args = 0;

		loop = atoi(argv[1]);
		order = (atoi(argv[4]) == 1) ? 1 : 0;	// anything but 1 means i -> i+1 pairing

		// Optional arguments are positional, so each one is read whenever it
		// is present, not only when it happens to be the last argument.
		if (argc >= 6)
		{
			output = atoi(argv[5]);
		}
		if (argc >= 7 && strcmp(argv[6], "no") == 0)
		{
			warmup = 0;
		}
		if (argc > 8)
		{
			endless = 1;
		}

		if (size > DEF_BW_SIZE)
		{
			if (myid == 0)
			{
				fprintf(stderr, "Maximum message size is %d\n", DEF_BW_SIZE);
			}
			bad_args = 1;
		}
		else if (size < -1)
		{
			if (myid == 0)
			{
				fprintf(stderr, "Message size must be -1 or a non-negative number of bytes\n");
			}
			bad_args = 1;
		}
		if (loop > MAX_REQ_NUM)
		{
			if (myid == 0)
			{
				fprintf(stderr, "Maximum number of iterations is %d\n", MAX_REQ_NUM);
			}
			bad_args = 1;
		}
		else if (loop < 1)
		{
			if (myid == 0)
			{
				fprintf(stderr, "Number of iterations must be at least 1\n");
			}
			bad_args = 1;
		}
		if (bad_args)
		{
			MPI_Finalize();
			return 0;
		}

		if (size == -1)
		{
			sweep = 1;
			size_bw = size_lt = DEF_BW_SIZE;
		}
		else
		{
			size_bw = size_lt = size;
		}

		// determine test type (an unrecognised value keeps both tests enabled)
		if (strcmp(test_type, "bw") == 0)
		{
			test_bw = 1;
			test_lt = 0;
		}
		else if (strcmp(test_type, "lt") == 0)
		{
			test_lt = 1;
			test_bw = 0;
		}
		else if (strcmp(test_type, "bo") == 0)
		{
			test_bw = 1;
			test_lt = 1;
		}
	}

	// Buffers are sized for the largest supported message; the gather target
	// is sized by the actual number of ranks.
	s_buf = malloc(MYBUFSIZE);
	r_buf = malloc(MYBUFSIZE);
	results = malloc((size_t)numprocs * sizeof *results);
	if (s_buf == NULL || r_buf == NULL || results == NULL)
	{
		fprintf(stderr, "[%d] out of memory\n", myid);
		MPI_Abort(MPI_COMM_WORLD, 1);
		return 1;
	}

	// determine which procs will be the senders (even ranks or first half of
	// the ranks, according to order) and who they talk to (remote)
	if (order == 0)
	{
		sender = (myid % 2 == 0);
		remote = sender ? myid + 1 : myid - 1;
	}
	else
	{
		sender = (myid < midd);
		remote = sender ? myid + midd : myid - midd;
	}

	do
	{
		MPI_Barrier(MPI_COMM_WORLD);

		if (test_bw)
		{
			int msg_size = sweep ? 1 : size_bw;
			int more;

			do
			{
				double bandwidth = 0.0;	// receivers contribute 0 to the gather
				double t_start, elapsed;

				if (warmup)
				{
					bw_round(sender, remote, msg_size, loop, s_buf, r_buf);
					MPI_Barrier(MPI_COMM_WORLD);
				}

				/* real test */
				t_start = MPI_Wtime();
				bw_round(sender, remote, msg_size, loop, s_buf, r_buf);
				elapsed = MPI_Wtime() - t_start;

				// calculate result (mbytes/sec)
				if (sender)
				{
					const double m_bytes = ((msg_size * 1.0) / 1.0e6) * loop;
					bandwidth = m_bytes / elapsed;
				}
				MPI_Barrier(MPI_COMM_WORLD);

				// gather all results in rank 0
				MPI_Gather(&bandwidth, 1, MPI_DOUBLE, results, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
				if (myid == 0)
				{
					report_results(results, numprocs, order, output == OUT_PER_RANK, msg_size, 0);
				}
				MPI_Barrier(MPI_COMM_WORLD);

				more = 0;
				if (sweep)
				{
					msg_size *= SIZE_MULT;
					more = (msg_size <= DEF_BW_SIZE);
				}
			}
			while (more); // end of sizes loop
		} // end of bandwidth test

		if (test_lt)
		{
			int msg_size = sweep ? 1 : size_lt;
			int more;

			do
			{
				double latency = 0.0;	// receivers contribute 0 to the gather
				const double elapsed = lt_round(sender, remote, msg_size, loop, skip, s_buf, r_buf);

				// calculate result (one-way time in usec)
				if (sender)
				{
					latency = elapsed * 1.0e6 / (2.0 * loop);
				}
				MPI_Barrier(MPI_COMM_WORLD);

				// gather all results in rank 0
				MPI_Gather(&latency, 1, MPI_DOUBLE, results, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
				if (myid == 0)
				{
					report_results(results, numprocs, order, output == OUT_PER_RANK, msg_size, 1);
				}
				MPI_Barrier(MPI_COMM_WORLD);

				more = 0;
				if (sweep)
				{
					msg_size *= SIZE_MULT;
					// NOTE(review): the latency sweep stops before
					// DEF_BW_SIZE while the bandwidth sweep includes it;
					// kept as-is, confirm whether that is intended.
					more = (msg_size < DEF_BW_SIZE);
				}
			}
			while (more); // end of sizes loop
		} // end of latency test

		if (endless && myid == 0)
		{
			fprintf(stdout, "---- I will run for ever, Ctrl-C to kill me ----\n");
			fprintf(stdout, "---- I will run for ever, Ctrl-C to kill me ----\n");
			fprintf(stdout, "---- I will run for ever, Ctrl-C to kill me ----\n");
		}
	}
	while (endless);

	free(results);
	free(r_buf);
	free(s_buf);

	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Finalize();
	return 0;
}
